import pymongo
import requests
import re
import os
import time

# Crawl pages 1-7 of the CCTV news JSONP feed. For every news item found,
# download its image into ./images/cctv and store the title, brief and image
# URL in the local MongoDB collection cctv_db.cctv_collection.

# Seconds to wait on any single HTTP request before giving up; without a
# timeout a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10

root = "./images"
cctv = os.path.join(root, "cctv")
# Creates both directory levels and tolerates them already existing.
os.makedirs(cctv, exist_ok=True)

client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["cctv_db"]
collection = db["cctv_collection"]

# The patterns are loop-invariant, so compile them once up front.
# Each "{...}" fragment of the JSONP payload is treated as one candidate item.
item_pattern = re.compile(r'\{.*?\}')
title_pattern = re.compile(r'"title":"(.*?)",', re.S)
image_pattern = re.compile(r'"image":"(.*?)",', re.S)
brief_pattern = re.compile(r'"brief":"(.*?)",', re.S)

try:
    for page in range(1, 8):
        url = f"https://news.cctv.com/2019/07/gaiban/cmsdatainterface/page/news_{page}.jsonp?cb=news"
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Let requests guess the charset from the body so the text decodes
        # correctly even when the response headers do not declare it.
        response.encoding = response.apparent_encoding
        htmls = response.text
        items = item_pattern.findall(htmls)
        for item in items:
            title = title_pattern.search(item)
            image = image_pattern.search(item)
            brief = brief_pattern.search(item)
            # Not every brace-delimited fragment is a complete news item;
            # skip fragments that lack any of the three required fields
            # instead of crashing on a failed match.
            if title is None or image is None or brief is None:
                continue
            title_text = title.group(1)
            image_url = image.group(1)
            print(f"正在爬取{title_text}数据...")
            img_res = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            # The local file is named after the last path segment of the URL.
            image_filename = os.path.join(cctv, os.path.basename(image_url))
            with open(image_filename, "wb") as f:
                f.write(img_res.content)
            news_data = {
                "title": title_text,
                "brief": brief.group(1),
                # NOTE(review): despite the key name, this stores the remote
                # image URL, not the local file path written above. Kept
                # as-is; confirm which one consumers of the collection expect.
                "image_path": image_url,
            }
            collection.insert_one(news_data)
            # Pause between items to avoid hammering the server.
            time.sleep(1)
finally:
    # Release the MongoDB connection even if the crawl aborts part-way.
    client.close()